Reran by Sagy on Sep 15 (2025) - removing CD41 and FUS lines
import io
import os
import sys
import pandas as pd
import contextlib
from IPython.display import display, Javascript
NOVA_HOME = '/home/projects/hornsteinlab/Collaboration/NOVA'
NOVA_DATA_HOME = '/home/projects/hornsteinlab/Collaboration/NOVA'
os.environ['NOVA_HOME'] = NOVA_HOME
sys.path.insert(1, os.getenv("NOVA_HOME"))
print(f"NOVA_HOME: {os.getenv('NOVA_HOME')}")
root_directory_raw = os.path.join(NOVA_DATA_HOME, 'input', 'images', 'raw', 'NIH', 'indi-image-pilot-20241128')
root_directory_proc = os.path.join(NOVA_DATA_HOME, 'input', 'images', 'processed', 'ManuscriptFinalData_80pct', 'NIH')
LOGS_PATH = os.path.join(NOVA_HOME, "outputs", "preprocessing", "ManuscriptFinalData_80pct", "NIH", "logs")
PLOT_PATH = os.path.join(NOVA_HOME, 'outputs', 'preprocessing', "ManuscriptFinalData_80pct", 'NIH', 'QC_figures')
from tools.preprocessing_tools.qc_reports.qc_utils import log_files_qc, run_validate_folder_structure, display_diff, sample_and_calc_variance, \
show_site_survival_dapi_brenner, show_site_survival_dapi_cellpose, \
show_site_survival_dapi_tiling, show_site_survival_target_brenner, \
calc_total_sums, plot_filtering_heatmap, show_total_sum_tables, \
plot_cell_count, plot_catplot, plot_hm_of_mean_cell_count_per_tile, \
run_calc_hist_new, show_total_valid_tiles_per_marker_and_batch
from tools.preprocessing_tools.qc_reports.qc_config import NIH_d8_panels, NIH_d8_markers, NIH_d8_marker_info, NIH_d8_cell_lines, NIH_d8_cell_lines_to_cond,\
NIH_d8_cell_lines_for_disp, NIH_d8_reps, NIH_d8_line_colors, NIH_d8_lines_order, NIH_d8_custom_palette,\
NIH_d8_expected_dapi_raw
%load_ext autoreload
%autoreload 2
NOVA_HOME: /home/projects/hornsteinlab/Collaboration/NOVA
# choose batches
batches = [f'batch{i}' for i in range(1,4)]
batches
['batch1', 'batch2', 'batch3']
df = log_files_qc(LOGS_PATH, only_wt_cond=False, batches=batches, filename_split='-',site_location=0)
#df['cell_line_cond'] = df['cell_line_cond'].str.replace(" ", "_")
df = df[df['cell_line'] == 'WT']
df_dapi = df[df.marker=='DAPI']
df_target = df[df.marker!='DAPI']
reading logs of batch3 reading logs of batch2 reading logs of batch1 Total of 3 files were read. Before dup handeling (124974, 21) After duplication removal #1: (124974, 22) After duplication removal #2: (124974, 22)
raws = run_validate_folder_structure(root_directory_raw, False,
NIH_d8_panels,
NIH_d8_markers.copy(),
PLOT_PATH,
NIH_d8_marker_info,
NIH_d8_cell_lines_to_cond,
NIH_d8_reps,
NIH_d8_cell_lines_for_disp,
NIH_d8_expected_dapi_raw,
batches=batches,
fig_width=8, fig_height = 40,
expected_count=25, check_antibody=False)
batch1
Folder structure is valid.
No bad files are found.
Total Sites: 17200
df_reset (208, 3) colored_df (208, 3)
Rep WT stress WT Untreated
Marker
ANAX11 rep1 25 25
ANAX11 rep2 25 25
ANAX11 rep3 25 25
ANAX11 rep4 25 25
ANAX11 rep5 25 25
... ... ... ...
DAPI rep4 275 275
DAPI rep5 275 275
DAPI rep6 275 275
DAPI rep7 275 275
DAPI rep8 275 275
[208 rows x 3 columns]
========
batch2
Folder structure is valid.
No bad files are found.
Total Sites: 17200
df_reset (208, 3) colored_df (208, 3)
Rep WT stress WT Untreated
Marker
ANAX11 rep1 25 25
ANAX11 rep2 25 25
ANAX11 rep3 25 25
ANAX11 rep4 25 25
ANAX11 rep5 25 25
... ... ... ...
DAPI rep4 275 275
DAPI rep5 275 275
DAPI rep6 275 275
DAPI rep7 275 275
DAPI rep8 275 275
[208 rows x 3 columns]
========
batch3
Folder structure is valid.
No bad files are found.
Total Sites: 17200
df_reset (208, 3) colored_df (208, 3)
Rep WT stress WT Untreated
Marker
ANAX11 rep1 25 25
ANAX11 rep2 25 25
ANAX11 rep3 25 25
ANAX11 rep4 25 25
ANAX11 rep5 25 25
... ... ... ...
DAPI rep4 275 275
DAPI rep5 275 275
DAPI rep6 275 275
DAPI rep7 275 275
DAPI rep8 275 275
[208 rows x 3 columns]
======== ====================
## Missing data issue was fixed
differences = (raws[0] != raws[1]).stack()
differences = differences[differences].index.to_frame(index=False)
differences.columns = ["Marker", "Rep", "Condition"]
for condition in differences["Condition"].unique():
print(f"Condition: {condition}")
condition_data = differences[differences["Condition"] == condition]
for rep in condition_data["Rep"].unique():
markers = condition_data[condition_data["Rep"] == rep]["Marker"].tolist()
print(f" Rep: {rep}")
print(f" Markers: {', '.join(markers)}")
procs = run_validate_folder_structure(root_directory_proc, True,
NIH_d8_panels,
NIH_d8_markers,
PLOT_PATH,
NIH_d8_marker_info,
NIH_d8_cell_lines_to_cond,
NIH_d8_reps,
NIH_d8_cell_lines_for_disp,
NIH_d8_expected_dapi_raw,
fig_width=8, fig_height=40,
expected_count=25,
check_antibody=False,
batches=batches)
batch1
Folder structure is valid.
No bad files are found.
Total Sites: 16575
df_reset (208, 3) colored_df (208, 3)
Rep WT stress WT Untreated
Marker
ANAX11 rep1 22 25
ANAX11 rep2 25 25
ANAX11 rep3 25 25
ANAX11 rep4 25 25
ANAX11 rep5 24 25
... ... ... ...
DAPI rep4 275 275
DAPI rep5 272 273
DAPI rep6 273 273
DAPI rep7 274 274
DAPI rep8 275 275
[208 rows x 3 columns]
========
batch2
Folder structure is valid.
No bad files are found.
Total Sites: 16336
df_reset (208, 3) colored_df (208, 3)
Rep WT stress WT Untreated
Marker
ANAX11 rep1 23 25
ANAX11 rep2 24 22
ANAX11 rep3 25 25
ANAX11 rep4 25 25
ANAX11 rep5 25 25
... ... ... ...
DAPI rep4 274 275
DAPI rep5 274 275
DAPI rep6 275 275
DAPI rep7 270 275
DAPI rep8 272 274
[208 rows x 3 columns]
========
batch3
Folder structure is valid.
No bad files are found.
Total Sites: 16326
df_reset (208, 3) colored_df (208, 3)
Rep WT stress WT Untreated
Marker
ANAX11 rep1 23 24
ANAX11 rep2 24 22
ANAX11 rep3 24 25
ANAX11 rep4 25 24
ANAX11 rep5 23 24
... ... ... ...
DAPI rep4 273 267
DAPI rep5 261 275
DAPI rep6 269 270
DAPI rep7 274 274
DAPI rep8 275 272
[208 rows x 3 columns]
======== ====================
display_diff(batches, raws, procs, PLOT_PATH, fig_width=8, fig_height=40)
batch1
======== batch2
======== batch3
========
for batch in batches[:1]:
with contextlib.redirect_stdout(io.StringIO()):
var = sample_and_calc_variance(root_directory_proc,
batch,
sample_size_per_markers=50,
cond_count=2,
rep_count=len(NIH_d8_reps),
num_markers=len(NIH_d8_markers))
print(f'{batch} var: ',var)
batch1 var: 0.02988291276091414
By order of filtering
Percentage out of the total sites
dapi_filter_by_brenner = show_site_survival_dapi_brenner(df_dapi,
batches,
NIH_d8_line_colors,
NIH_d8_panels,
NIH_d8_reps,
figsize=(6,18),
vmax=25)
Percentage out of the sites that passed the previous filter. In parenthesis are absolute values.
A site will be filtered out if Cellpose found 0 cells in it.
dapi_filter_by_cellpose = show_site_survival_dapi_cellpose(df_dapi,
batches,
dapi_filter_by_brenner,
NIH_d8_line_colors,
NIH_d8_panels,
NIH_d8_reps,
figsize=(6,18))
Percentage out of the sites that passed the previous filter. In parenthesis are absolute values.
A site will be filtered out if after tiling, no tile is containing at least one whole cell that Cellpose detected.
dapi_filter_by_tiling=show_site_survival_dapi_tiling(df_dapi,
batches,
dapi_filter_by_cellpose,
NIH_d8_line_colors,
NIH_d8_panels,
NIH_d8_reps,
figsize=(6,18))
Percentage out of the sites that passed the previous filter. In parenthesis are absolute values (if different than the percentages).
show_site_survival_target_brenner(df_dapi,
df_target,
dapi_filter_by_tiling,
NIH_d8_markers,
figsize=(6,18))
names = ['Total number of tiles', 'Total number of whole cells']
stats = ['n_valid_tiles','site_whole_cells_counts_sum','site_cell_count','site_cell_count_sum']
total_sum = calc_total_sums(df_target, df_dapi, stats, NIH_d8_markers)
# markers_for_dnls = markers.copy() #TODO need to change according to - if we use all markers or just the d8 ones!!!!
# markers_for_dnls.remove('TIA1')
# markers_for_dnls += ['TDP43B']
total_sum[total_sum.marker.isin(NIH_d8_markers)].n_valid_tiles.sum()
590634
total_sum[total_sum.marker =='DAPI'].site_whole_cells_counts_sum.sum()
110373.0
total_sum[total_sum.marker =='DAPI'].site_cell_count.sum()
347157.0
show_total_sum_tables(total_sum)
| n_valid_tiles | % valid tiles | site_whole_cells_counts_sum | site_cell_count | |
|---|---|---|---|---|
| batch1 | ||||
| count | 688.000000 | 688.000000 | 688.000000 | 688.000000 |
| mean | 332.860465 | 3.328605 | 237.784884 | 761.949128 |
| std | 56.413226 | 0.564132 | 50.451934 | 135.014222 |
| min | 80.000000 | 0.800000 | 56.000000 | 155.000000 |
| 25% | 301.000000 | 3.010000 | 207.000000 | 699.000000 |
| 50% | 336.000000 | 3.360000 | 235.500000 | 768.000000 |
| 75% | 369.000000 | 3.690000 | 268.250000 | 852.000000 |
| max | 466.000000 | 4.660000 | 360.000000 | 1102.000000 |
| sum | 229008.000000 | NaN | 163596.000000 | 524221.000000 |
| expected_count | 450.000000 | 450.000000 | 450.000000 | 450.000000 |
| n_valid_tiles | % valid tiles | site_whole_cells_counts_sum | site_cell_count | |
|---|---|---|---|---|
| batch2 | ||||
| count | 687.000000 | 687.000000 | 687.000000 | 687.000000 |
| mean | 269.903930 | 2.699039 | 189.056769 | 590.714702 |
| std | 50.685184 | 0.506852 | 38.856257 | 116.194745 |
| min | 4.000000 | 0.040000 | 3.000000 | 11.000000 |
| 25% | 242.000000 | 2.420000 | 167.000000 | 529.000000 |
| 50% | 271.000000 | 2.710000 | 191.000000 | 600.000000 |
| 75% | 299.000000 | 2.990000 | 212.000000 | 657.500000 |
| max | 385.000000 | 3.850000 | 290.000000 | 918.000000 |
| sum | 185424.000000 | NaN | 129882.000000 | 405821.000000 |
| expected_count | 450.000000 | 450.000000 | 450.000000 | 450.000000 |
| n_valid_tiles | % valid tiles | site_whole_cells_counts_sum | site_cell_count | |
|---|---|---|---|---|
| batch3 | ||||
| count | 687.000000 | 687.000000 | 687.000000 | 687.0000 |
| mean | 256.480349 | 2.564803 | 179.742358 | 556.9869 |
| std | 50.003112 | 0.500031 | 39.653665 | 112.6077 |
| min | 19.000000 | 0.190000 | 7.000000 | 37.0000 |
| 25% | 231.000000 | 2.310000 | 155.000000 | 494.5000 |
| 50% | 265.000000 | 2.650000 | 186.000000 | 578.0000 |
| 75% | 287.000000 | 2.870000 | 207.000000 | 624.0000 |
| max | 378.000000 | 3.780000 | 273.000000 | 818.0000 |
| sum | 176202.000000 | NaN | 123483.000000 | 382650.0000 |
| expected_count | 450.000000 | 450.000000 | 450.000000 | 450.0000 |
| n valid tiles | % valid tiles | site_whole_cells_counts_sum | site_cell_count | |
|---|---|---|---|---|
| All batches | ||||
| count | 2062.000000 | 2062.000000 | 2062.000000 | 2.062000e+03 |
| mean | 286.437439 | 2.864374 | 202.211930 | 6.366111e+02 |
| std | 62.111166 | 0.621112 | 50.227316 | 1.511637e+02 |
| min | 4.000000 | 0.040000 | 3.000000 | 1.100000e+01 |
| 25% | 253.000000 | 2.530000 | 170.000000 | 5.420000e+02 |
| 50% | 284.000000 | 2.840000 | 201.000000 | 6.290000e+02 |
| 75% | 329.000000 | 3.290000 | 231.000000 | 7.330000e+02 |
| max | 466.000000 | 4.660000 | 360.000000 | 1.102000e+03 |
| sum | 590634.000000 | NaN | 416961.000000 | 1.312692e+06 |
| expected_count | 450.000000 | 450.000000 | 450.000000 | 4.500000e+02 |
For each batch, cell line, replicate and marker: Total number of tiles
show_total_valid_tiles_per_marker_and_batch(total_sum, vmax=15000)
to_heatmap = total_sum.rename(columns={'n_valid_tiles':'index'})
plot_filtering_heatmap(to_heatmap,
extra_index='marker',
vmin=None, vmax=None,
xlabel = 'Total number of tiles',
show_sum=True, figsize=(7,28),
fmt=".0f")
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: FixedFormatter should only be used together with FixedLocator ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: FixedFormatter should only be used together with FixedLocator ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: FixedFormatter should only be used together with FixedLocator ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
For each batch, cell line, replicate and markerTotal number of tiles
to_heatmap = total_sum.rename(columns={'site_whole_cells_counts_sum':'index'})
plot_filtering_heatmap(to_heatmap,
extra_index='marker',
vmin=None, vmax=None,
xlabel = 'Total number of whole cells',
show_sum=True,
figsize=(7,28),
fmt=".0f")
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: FixedFormatter should only be used together with FixedLocator ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: FixedFormatter should only be used together with FixedLocator ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: FixedFormatter should only be used together with FixedLocator ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
df_no_empty_sites = df_dapi[df_dapi.n_valid_tiles !=0]
plot_cell_count(df_no_empty_sites,
NIH_d8_lines_order,
NIH_d8_custom_palette,
y='site_cell_count_sum',
title='Cell Count Average per Site (from tiles)',
figsize=(16,6))
plot_cell_count(df_no_empty_sites,
NIH_d8_lines_order,
NIH_d8_custom_palette,
y='site_whole_cells_counts_sum',
title='Whole Cell Count Average per Site',
figsize=(16,6))
plot_cell_count(df_no_empty_sites,
NIH_d8_lines_order,
NIH_d8_custom_palette,
y='site_cell_count',
title='Cellpose Cell Count Average per Site',
figsize=(16,6))
df_dapi.groupby(['cell_line_cond']).n_valid_tiles.mean()
cell_line_cond WT Untreated 11.889007 WT stress 11.968573 Name: n_valid_tiles, dtype: float64
df_dapi[['site_cell_count']].mean()
site_cell_count 26.518753 dtype: float64
# plot_catplot(df_dapi,
# NIH_d8_custom_palette,
# NIH_d8_reps,
# x='n_valid_tiles',
# x_title='valid tiles count',
# batch_min=1,
# batch_max=3,
# height=6)
plot_hm_of_mean_cell_count_per_tile(df_dapi,
split_by='rep',
rows='cell_line_cond',
columns='panel',
figsize=(18,6))
# for batch in batches:
# print(batch)
# run_calc_hist_new(f'{batch}', dnls_opera_cell_lines_for_disp, dnls_opera_markers,
# root_directory_raw, root_directory_proc,
# hist_sample=10,sample_size_per_markers=200, ncols=8, nrows=4, dnls=True)
# print("="*30)
# # save notebook as HTML
# from IPython.display import display, Javascript
# display(Javascript('IPython.notebook.save_checkpoint();'))
# os.system(f'jupyter nbconvert --to html {NOVA_HOME}/tools/preprocessing_tools/qc_reports/qc_report_NIH_NeuronsDay8.ipynb --output {NOVA_HOME}/manuscript/preprocessing_qc_reports/ManuscriptFinalData/qc_report_NIH_NeuronsDay8.html')